*-------------------------------------------------------------------------------
*						Data Pre-Process
*-------------------------------------------------------------------------------

** Project folders (machine-specific absolute paths)
global Raw_data   "G:\project-finished\Descriptive\Data"
global App_data   "G:\project-finished\Descriptive\Appendix Data"
global Class_data "G:\project-finished\Descriptive\Classification"
global Work_lab   "G:\project-finished\Descriptive\Lab"
global Out_lab    "G:\project-finished\Descriptive\Out"

* Never pause output for -more-
set more off

* Every save/use with a relative file name below resolves against this folder
cd "$Work_lab"

* Close a log left open by an earlier run (no error if none is open),
* then start a fresh log in the output folder
capture log close
log using "$Out_lab\Pre-occ", replace


**------------------------------------------------------------------------------
* Step 1: Generate data
* Crosswalk between CHN-OCC2010 and CHN-OCC2015
* (1) Build a consistent occupation list, CHN-consistent, that links
*     CHN-OCC2010 and CHN-OCC2015                      <- this section
* -   Link ONET-SOC2010 to CHN-consistent
* -   Merge CHN-consistent with the different O*NET characteristics
**------------------------------------------------------------------------------
clear all

//   GB/T6565-2009 with GB/T6565-2015
import excel "$Class_data\occ2015_match_occ2010.xlsx", sheet("Sheet1") firstrow allstring clear

* NOTE(review): "occ _2015" is written as two names here while occ_2015 is used
* below; confirm the sheet really has separate columns named occ and _2015.
drop G H I occ _2015

* Remove the hyphen from 2010 codes that are 4 characters long, or 3 characters
* long without a slash. All other values (for example the slash-separated
* groups) are left as they are.
* The slash test uses strpos() on "/" instead of a strmatch() wildcard pattern:
* the wildcard pattern put a slash directly before an asterisk inside the
* literal, a character pair that do-file comment handling may read as the start
* of a block comment.
gen occ_temp = occ2010 if strlen(occ2010)==4 | (strlen(occ2010)==3 & strpos(occ2010,"/")==0)
replace occ_temp = subinstr(occ_temp, "-", "", .)
replace occ2010 = occ_temp if occ_temp != ""

* The sheet uses the literal text "missing" for an absent code
replace occ2010  = "" if occ2010  == "missing"
replace occ_2015 = "" if occ_2015 == "missing"

* DO NOT NEED TO ADJUST
* exist15 and exist10 hold the number of rows sharing the same (code, title)
* pair; they stay missing when the title is empty.
gen num = _n
bysort occ_2015 title_2015_all: egen exist15 = count(num) if title_2015_all != ""
bysort occ2010 title_2010: egen exist10 = count(num) if title_2010 != ""
sort occ_2015 occ2010

* Drop "1-dig" or "3-dig" matching
*   multi occ15 : 2-dig occ10
*   2-dig occ15 : 2-dig occ10
*   1-dig occ15 : 1-dig occ10
* Stata ranks missing above every number, so exist15>1 is also true when
* exist15 is missing.
drop if strlen(occ2010)==2 & exist15>1
drop if strlen(occ2010)==2 & substr(occ_2015,4,5)=="00"
drop if strpos(occ2010,"/")>0 & substr(occ_2015,4,5)=="00"
drop if strlen(occ2010)==1 & substr(occ_2015,2,5)=="0000"
drop if substr(occ_2015,4,5)=="00" & occ2010==""

* Recount after the drops
drop exist* num
gen num = _n
bysort occ_2015 title_2015_all: egen exist15 = count(num) if title_2015_all != ""
bysort occ2010 title_2010: egen exist10 = count(num) if title_2010 != ""

* Classify each row by how often its 2010 pair and its 2015 pair occur:
*   tag 1  both pairs occur once
*   tag 2  2010 pair once, 2015 pair on several rows
*   tag 3  2015 pair once, 2010 pair on several rows
*   tag 4  no 2010 title, 2015 pair once
*   tag 5  2010 pair once, no 2015 title
*   tag 6  both pairs on several rows
* The !=. guards are needed because missing counts as greater than 1.
gen tag = 1 if exist10==1 & exist15==1
replace tag = 2 if exist10==1 & exist15>1 & exist15!=.
replace tag = 3 if exist15==1 & exist10>1 & exist10!=.
replace tag = 4 if exist10==. & exist15==1
replace tag = 5 if exist10==1 & exist15==.
replace tag = 6 if exist10>1 & exist15>1 & exist15!=. & exist10!=.

* Consistent code: "occ2010+occ_2015" for tag 1, the 2015 code for tag 2, the
* 2010 code for tag 3, and "no" standing in for the absent side in tags 4 and 5.
* Tag 6 rows receive no consistent code here.
gen consistent = occ2010 + "+" + occ_2015 if tag==1
replace consistent = occ_2015 if tag==2 & occ_2015!=""
replace consistent = occ2010 if tag==3 & occ2010!=""
replace consistent = "no" + "+" + occ_2015 if tag==4
replace consistent = occ2010 + "+" + "no" if tag==5

*Pairings that involve an "others" category: remove these occ15-occ10 rows
drop if occ2010=="449" & inlist(occ_2015,"40704","41304")
drop if occ2010=="529" & occ_2015=="40999"
drop if occ2010=="739" & inlist(occ_2015,"41203","62404")
drop if occ2010=="339" & occ_2015=="40299"
drop if occ2010=="699" & inlist(occ_2015,"61901","62001","62199","62202","62405","62504","62601")
drop if occ2010=="6-7/6-8/6-9" & inlist(occ_2015,"61901","62001","62202","62405","62504","62601")

*For each 2010 code below, one 2015 pairing is removed and the named remaining
*pairing takes the 2015 code as its consistent code
drop if occ2010=="656" & occ_2015=="61199"
replace consistent = occ_2015 if occ2010=="656" & occ_2015=="61106"

drop if occ2010=="911" & occ_2015=="63000"
replace consistent = occ_2015 if occ2010=="911" & occ_2015=="63002"

drop if occ2010=="849" & occ_2015=="41303"
replace consistent = occ_2015 if occ2010=="849" & occ_2015=="41302"

*"Other environment and waste treatment personnel" (qi ta huanjing yu feiwu
*chuli renyuan) goes into "environmental governance service personnel"
*(huanjing zhili fuwu renyuan)
drop if occ2010=="929" & occ_2015=="40806"
replace consistent = occ_2015 if occ2010=="929" & occ_2015=="40907"

*"Other glass and enamel products production personnel" (qita boli he tangci
*zhipin shengchan renyuan) is treated as having no 2015 occupation.
*The two drops must run before occ_2015 is blanked, because they test its value.
replace consistent = occ2010 + "+" + "no" if occ2010=="839"
drop if occ2010=="839" & inlist(occ_2015,"61505","61503")
replace occ_2015 = "" if occ2010=="839"
replace title_2015_all = "" if occ2010=="839"

*Title of the consistent code.
*Each later rule overwrites the earlier ones where it applies, so order matters.
generate title_consistent = title_2015_all if consistent==occ_2015
replace  title_consistent = title_2010     if consistent==occ2010
replace  title_consistent = title_2015_all if substr(consistent,-5,.)==occ_2015
replace  title_consistent = title_2010     if substr(consistent,-2,.)=="no"

keep occ_2015 title_2015_all occ2010 title_2010 consistent title_consistent

*1-digit group = first character of the 2015 code.
*2-digit group = first three characters of the 2015 code, filled only when the
*consistent code ends with the 2015 code.
generate occ_1dig = substr(occ_2015,1,1)
generate occ_2dig = substr(occ_2015,1,3) if substr(consistent,-5,.)==occ_2015

*Hand-assigned groups for the 2010 codes 029, 039 and 839
replace occ_1dig = "1"   if inlist(occ2010,"029","039")
replace occ_1dig = "6"   if occ2010=="839"
replace occ_2dig = "620" if occ2010=="839"
replace occ_2dig = "107" if inlist(occ2010,"029","039")

*Hand-assigned 2-digit groups, keyed on the consistent code: the k-th entry of
*from_codes receives the k-th entry of to_groups.
*NOTE(review): 529 is mapped to itself, unlike every other pair - confirm.
local from_codes 116 255 276 339 414 449 523 529 699 739 779 819
local to_groups  201 209 210 404 405 403 409 529 623 625 602 607
local n_pairs : word count `from_codes'
forvalues k = 1/`n_pairs' {
	local from : word `k' of `from_codes'
	local to   : word `k' of `to_groups'
	replace occ_2dig = "`to'" if consistent=="`from'"
}

save "occ2015_match_occ2010_consistent.dta", replace

*Save two lookup files from the crosswalk: 2015 code -> consistent code and
*2010 code -> consistent code.
*NOTE(review): -unique- is not an official Stata command (presumably the SSC
*package); it only reports the number of distinct values, it does not enforce
*uniqueness - confirm it is installed.
use "occ2015_match_occ2010_consistent.dta", clear

*2015 side: distinct rows that carry a 2015 code
preserve
	keep occ_2015 title_2015_all consistent title_consistent occ_1dig occ_2dig
	duplicates drop
	keep if occ_2015 != ""
	unique occ_2015
	save "occ2015_consistent.dta", replace
restore

*2010 side: distinct rows that carry both a 2010 code and a 2010 title
preserve
	keep occ2010 title_2010 consistent title_consistent occ_1dig occ_2dig
	duplicates drop
	keep if occ2010 != "" & title_2010 != ""
	unique occ2010
	save "occ2010_consistent.dta", replace
restore


**------------------------------------------------------------------------------
* Step 1: Generate data
* Crosswalk between CHN-OCC2010 and CHN-OCC2015
* -   Build a consistent occupation list, CHN-consistent, that links
*     CHN-OCC2010 and CHN-OCC2015
* (2) Link ONET-SOC2010 to CHN-consistent              <- this section
* -   Merge CHN-consistent with the different O*NET characteristics
**------------------------------------------------------------------------------
* Start with an empty accumulator file; both sources below are appended to it
clear
set obs 0
save CHN-consistent_ONETSOC2010, replace emptyok


*CHN2010 -> ONET/SOC2010
import excel "$Raw_data\occupation2009-ONET.xlsx", sheet("census-ONET-original") firstrow allstring clear
drop E F G H I J

* Keep only ids that are 3 characters long once the hyphens are removed
replace id = subinstr(id,"-","",.)
generate len = strlen(id)
tabulate len
keep if len==3
drop len
rename id occ2010

*CHN2010 -> consistent code: matched rows only
merge m:1 occ2010 using "occ2010_consistent.dta", keep(match) nogenerate
keep occ2010 title_2010 consistent title_consistent onet_code onet_title occ_1dig occ_2dig

append using CHN-consistent_ONETSOC2010
save CHN-consistent_ONETSOC2010, replace


*CHN2015 -> ONET/SOC2010
import excel "$Raw_data\occ2015_sem.xls", sheet("2015match") firstrow allstring clear
drop E F G H I J K L M N

*CHN2015 -> consistent code: matched rows plus rows found only in the
*consistent list (rows found only in the spreadsheet are discarded)
merge m:1 occ_2015 using "occ2015_consistent.dta", keep(match using) nogenerate

append using CHN-consistent_ONETSOC2010
save CHN-consistent_ONETSOC2010, replace

keep consistent title_consistent onet_code onet_title occ_1dig occ_2dig
duplicates drop

*A consistent code that has several rows loses the rows with no ONET-SOC code;
*a consistent code with a single row keeps it even when the code is empty
bysort consistent title_consistent: generate n_rows = _N
drop if n_rows>1 & onet_code==""
drop n_rows

save CHN-consistent_ONETSOC2010, replace


*The full crosswalk file is no longer needed
erase "occ2015_match_occ2010_consistent.dta"

**------------------------------------------------------------------------------
* Step 2: Generate data
* (1) Crosswalk between CHN-OCC2000 and CHN-OCC2010 (already consistent)
* (2) Link ONET-SOC2009 to CHN-consistent              <- this section
**------------------------------------------------------------------------------
import excel "$Class_data\2009_to_2010_Crosswalk.xlsx", sheet("O-NET-SOC 2010 Occupation Listi") cellrange(A4:D1114) firstrow case(lower) allstring clear

* A 2010 code can sit on several crosswalk rows. temp numbers the rows within
* each code, so every slice temp==i holds each code at most once - which is
* what merge 1:m requires of the data in memory.
bysort onetsoc2010code: gen temp = _n
rename onetsoc2010code onet_code

* Store the slice numbers that actually occur. The save loop and the append
* loop both walk this one list, so they cannot drift apart (the append loop
* was previously hard-coded to 1..4).
levelsof temp, local(slices)
foreach i of local slices {
	preserve
	keep if temp==`i'
	merge 1:m onet_code using CHN-consistent_ONETSOC2010
	keep if _merge==3
	drop _merge
	save ONETSOC_09TO10_`i', replace
	restore
}


* Stack every slice under the base list. Base rows carry no 2009 code and act
* as placeholders for consistent codes without any match.
use CHN-consistent_ONETSOC2010, clear
foreach i of local slices {
	append using ONETSOC_09TO10_`i'
}
keep consistent title_consistent occ_1dig occ_2dig onetsoc2009code onetsoc2009title
duplicates drop
sort consistent

* Drop rows with an empty 2009 code for every consistent code that has more
* than one row
gen num = 1
bysort consistent: egen temp = count(num)
drop if temp>1 & onetsoc2009code==""
drop num temp
save CHN-consistent_ONETSOC2009, replace

**------------------------------------------------------------------------------
* Step 2: Generate data
* (1) Crosswalk between CHN-OCC2000 and CHN-OCC2010 (already consistent)
* (2) Link ONET-SOC2006 to CHN-consistent              <- this section
**------------------------------------------------------------------------------
import excel "$Class_data\2006_to_2009_Crosswalk.xlsx", sheet("O-NET-SOC 2009 Occupation Listi") cellrange(A4:D953) firstrow case(lower) allstring clear


* temp numbers the crosswalk rows within each 2009 code, so every slice
* temp==i holds each code at most once, as merge 1:m requires of the data in
* memory.
bysort onetsoc2009code: gen temp = _n

* Store the slice numbers that actually occur. The save loop and the append
* loop both walk this one list, so they cannot drift apart (the append loop
* was previously hard-coded to 1..1).
levelsof temp, local(slices)
foreach i of local slices {
	preserve
	keep if temp==`i'
	merge 1:m onetsoc2009code using CHN-consistent_ONETSOC2009
	keep if _merge==3
	drop _merge
	save ONETSOC_06TO09_`i', replace
	restore
}



* Stack every slice under the base list. Base rows carry no 2006 code and act
* as placeholders for consistent codes without any match.
use CHN-consistent_ONETSOC2009, clear
foreach i of local slices {
	append using ONETSOC_06TO09_`i'
}
keep consistent title_consistent occ_1dig occ_2dig onetsoc2006code onetsoc2006title
duplicates drop
sort consistent

* Drop rows with an empty 2006 code for every consistent code that has more
* than one row
gen num = 1
bysort consistent: egen temp = count(num)
drop if temp>1 & onetsoc2006code==""
drop num temp
save CHN-consistent_ONETSOC2006, replace

**------------------------------------------------------------------------------
* Step 2: Generate data
* (1) Crosswalk between CHN-OCC2000 and CHN-OCC2010 (already consistent)
* (2) Link ONET-SOC2000 to CHN-consistent              <- this section
**------------------------------------------------------------------------------
import excel "$Class_data\2000_to_2006_Crosswalk.xlsx", sheet("O-NET-SOC 2006 Occupation Listi") cellrange(A4:D1171) firstrow case(lower) allstring clear


* temp numbers the crosswalk rows within each 2006 code, so every slice
* temp==i holds each code at most once, as merge 1:m requires of the data in
* memory.
bysort onetsoc2006code: gen temp = _n

* Store the slice numbers that actually occur. The save loop and the append
* loop both walk this one list, so they cannot drift apart (the append loop
* was previously hard-coded to 1..14).
levelsof temp, local(slices)
foreach i of local slices {
	preserve
	keep if temp==`i'
	merge 1:m onetsoc2006code using CHN-consistent_ONETSOC2006
	keep if _merge==3
	drop _merge
	save ONETSOC_00TO06_`i', replace
	restore
}


* Stack every slice under the base list. Base rows carry no 2000 code and act
* as placeholders for consistent codes without any match.
use CHN-consistent_ONETSOC2006, clear
foreach i of local slices {
	append using ONETSOC_00TO06_`i'
}
keep consistent title_consistent occ_1dig occ_2dig onetsoc2000code onetsoc2000title
duplicates drop
sort consistent

* Drop rows with an empty 2000 code for every consistent code that has more
* than one row
gen num = 1
bysort consistent: egen temp = count(num)
drop if temp>1 & onetsoc2000code==""
drop num temp

* Manual 2000-code assignments, keyed on the consistent title
* (a line that repeated one of these assignments verbatim was removed)
replace onetsoc2000code="29-1199.99" if title_consistent=="中西医结合医师"
replace onetsoc2000code="29-1069.99" if title_consistent=="公共卫生辅助服务人员"
replace onetsoc2000code="51-8099.99" if title_consistent=="农村能源利用人员"
replace onetsoc2000code="17-2199.99" if title_consistent=="标准化、计量、质量和认证认可工程技术人员"
replace onetsoc2000code="11-9151.00" if title_consistent=="民办非企业单位负责人"
replace onetsoc2000code="11-9121.00" if title_consistent=="水产技术人员"
replace onetsoc2000code="17-1022.00" if title_consistent=="水文服务人员"
replace onetsoc2000code="23-1011.00" if title_consistent=="法律顾问"
replace onetsoc2000code="17-2199.99" if title_consistent=="电力工程技术人员"
save CHN-consistent_ONETSOC2000, replace



* Remove the intermediate slice files written by the three crosswalk loops.
* They are located by name pattern in the working folder instead of by
* hard-coded counts (previously 4, 1 and 14), so the cleanup neither fails nor
* leaves files behind when a crosswalk produces a different number of slices.
* Any file in the working folder matching one of these patterns is erased.
foreach stem in ONETSOC_09TO10 ONETSOC_06TO09 ONETSOC_00TO06 {
	local slicefiles : dir "." files "`stem'_*.dta", respectcase
	foreach f of local slicefiles {
		erase "`f'"
	}
}


**------------------------------------------------------------------------------
/*							OUTPUT FILES (saved in $Work_lab)
			occ2015_consistent
			occ2010_consistent
			CHN-consistent_ONETSOC2010
			CHN-consistent_ONETSOC2009
			CHN-consistent_ONETSOC2006
			CHN-consistent_ONETSOC2000
*/
**------------------------------------------------------------------------------


log close


